library(edgeR)
library(ggplot2)
library(readr)
library(here)

# Create output directories if they don't exist
dir.create(file.path("edgeR"), showWarnings = FALSE)
dir.create(file.path("edgeR_transcript"), showWarnings = FALSE, recursive = TRUE)
dir.create(file.path("edgeR_gene"), showWarnings = FALSE, recursive = TRUE)

Transcripts

Load count data - Transcripts

counts <- read.csv("/Volumes/sheynkman/projects/LRP_Mohi_project/03_filter_sqanti/MDS_filtered_raw_counts.tsv", 
                   header = TRUE, row.names = 1)

group <- factor(c("Q157R", "Q157R", "Q157R", "WT", "WT", "WT"))

dge_raw <- DGEList(counts = counts, group = group)

Visualize raw counts

boxplot(dge_raw$counts, main = "Boxplot of Raw Counts", las = 2, col = as.numeric(group))

plotMDS(dge_raw, col = as.numeric(group), main = "MDS Plot for Raw Counts")

plotMD(dge_raw, col = as.numeric(group), main = "MD Plot for Raw Counts")
abline(h = 0, col = "red", lty = 2, lwd = 2)

MD (mean-difference) plots per sample

par(mfrow = c(2, 3), mar = c(4, 4, 2, 1))
for (i in 1:ncol(dge_raw$counts)) {
  plotMD(dge_raw, column = i, main = paste("MD Plot for Sample", i))
  abline(h = 0, col = "red", lty = 2, lwd = 2)
}

par(mfrow = c(1, 1)) # Reset plotting area to single panel

Filter lowly expressed transcripts (required for modeling)

keep <- filterByExpr(dge_raw)
table(keep)
## keep
##  FALSE   TRUE 
## 292011  37723
dge <- dge_raw[keep, , keep.lib.sizes = FALSE]

Normalize Counts with TMM

dge <- calcNormFactors(dge, method = "TMM")

Save counts matrices

# Save dge_raw and dge_norm objects for downstream correlation analyses
save(dge_raw, file = file.path("edgeR_transcript", "dte_raw.rda"))
save(dge,      file = file.path("edgeR_transcript", "dte_norm.rda"))

Visualize normalized counts

boxplot(dge$counts, main = "Boxplot of Normalized Counts")

Log-transform and boxplot: Transcript-level CPM

log_cpm_tx <- cpm(dge, log = TRUE, prior.count = 1)

boxplot(log_cpm_tx,
        main = "Boxplot of log2(CPM + 1) - Transcripts",
        las = 2,
        col = as.numeric(group))

, las = 2, col = as.numeric(group)) plotMDS(dge, col = as.numeric(group), main = “MDS Plot for Normalized Counts”) plotMD(dge, col = as.numeric(group), main = “MD Plot for Normalized Counts”) abline(h = 0, col = “red”, lty = 2, lwd = 2)


Design matrix and dispersion

``` r
design <- model.matrix(~ group)
rownames(design) <- colnames(dge)

dge_disp <- estimateDisp(dge, design)
dge_disp$common.dispersion
## [1] 0.07809944
plotBCV(dge_disp)

Fit model and test

fit <- glmQLFit(dge_disp, design)
plotQLDisp(fit)

result <- glmQLFTest(fit, coef = 2)
topTags(result)
## Coefficient:  groupWT 
##                logFC   logCPM         F       PValue        FDR
## PB.21599.2  2.592444 4.875611 121.33363 4.501609e-07 0.01590197
## PB.9626.41  2.657393 3.640055 106.58645 8.430913e-07 0.01590197
## PB.17614.66 5.520253 1.697234  61.15331 2.148668e-06 0.01611877
## PB.20180.8  2.300030 4.451426  84.60091 2.509844e-06 0.01611877
## PB.16163.1  2.290359 5.648489  82.23354 2.864402e-06 0.01611877
## PB.11107.2  2.308405 4.354890  81.53789 2.982938e-06 0.01611877
## PB.11890.1  2.275614 4.935824  79.41331 3.371832e-06 0.01611877
## PB.2313.1   2.059830 7.696301  74.94752 4.409125e-06 0.01611877
## PB.24052.20 8.827273 2.260153 107.02258 4.458135e-06 0.01611877
## PB.3791.1   2.301812 3.958942  73.81950 4.737358e-06 0.01611877

FDR and DEG Summary

FDR <- p.adjust(result$table$PValue, method = "BH")
sum(FDR < 0.05)
## [1] 104
qlf <- glmQLFTest(fit)
top <- rownames(topTags(qlf))
cpm(dge_disp)[top, ]
##             BioSample_1 BioSample_2 BioSample_3 BioSample_4 BioSample_5
## PB.21599.2     7.893988    7.636693     9.13440   42.179837   59.923100
## PB.9626.41     3.633741    3.369129     2.89256   20.906528   19.708042
## PB.17614.66    0.000000    0.000000     0.30448    4.768155    5.459660
## PB.20180.8     6.390371    7.187475     8.22096   28.058761   39.016596
## PB.16163.1    17.542196   11.679647    21.16136   69.321645   95.477472
## PB.11107.2     7.016878    4.941389     8.06872   29.342495   36.885997
## PB.11890.1    11.026523    8.085910    11.72248   39.795759   48.337967
## PB.2313.1     92.221831   62.665801    85.10216  234.006400  348.619278
## PB.24052.20    0.000000    0.000000     0.00000    6.051890    8.389234
## PB.3791.1      4.134946    4.492172     6.69856   24.024168   24.501890
##             BioSample_6
## PB.21599.2    47.087966
## PB.9626.41    22.262678
## PB.17614.66    6.887015
## PB.20180.8    40.200951
## PB.16163.1    83.445002
## PB.11107.2    34.274914
## PB.11890.1    62.463629
## PB.2313.1    419.467292
## PB.24052.20   12.172399
## PB.3791.1     27.387899
summary(decideTests(qlf))
##        groupWT
## Down         7
## NotSig   37619
## Up          97
plotMD(qlf)
abline(h = c(-1, 1), col = "blue")

Save DEG results & intermediate files

deg_results <- topTags(result, n = Inf)$table
write.table(deg_results, 
            file = file.path("edgeR_transcript", "transcript_DEG_results.txt"), 
            sep = "\t", quote = FALSE, row.names = TRUE)

# Raw and normalized CPM Matrices
write.table(cpm(dge_raw), 
            file = file.path("edgeR_transcript", "raw_CPM_matrix.txt"), 
            sep = "\t", quote = FALSE)
write.table(cpm(dge), 
            file = file.path("edgeR_transcript", "normalized_CPM_matrix.txt"), 
            sep = "\t", quote = FALSE)

# List of filtered-in transcripts
write.table(rownames(dge), 
            file = file.path("edgeR_transcript", "filtered_transcripts.txt"), 
            quote = FALSE, row.names = FALSE, col.names = FALSE)

# Design matrix to keep record of the model used 
write.table(design, 
            file = file.path("edgeR_transcript", "design_matrix.txt"), 
            sep = "\t", quote = FALSE, row.names = TRUE, col.names = NA)

# Top genes table 
top_genes <- topTags(qlf, n = 50)$table
write.table(top_genes, 
            file = file.path("edgeR_transcript", "top_transcripts.txt"), 
            sep = "\t", quote = FALSE, row.names = TRUE)

# Save dispersion plots (BCV and QL)
png(file.path("edgeR_transcript", "BCV_plot.png"), width = 800, height = 600)
plotBCV(dge_disp)
dev.off()
## quartz_off_screen 
##                 2
png(file.path("edgeR_transcript", "QLDisp_plot.png"), width = 800, height = 600)
plotQLDisp(fit)
dev.off()
## quartz_off_screen 
##                 2
# MDS plot (normalized)
png(file.path("edgeR_transcript", "MDS_plot.png"), width = 800, height = 600)
plotMDS(dge, col = as.numeric(group), main = "MDS Plot for Normalized Counts")
dev.off()
## quartz_off_screen 
##                 2

Volcano Plot - left = downregulated in Q157R vs. WT (higher in WT); right = up-regulated in Q157R vs. WT (higher in Q157R)

deg_results$Gene <- rownames(deg_results)

ggplot(deg_results, aes(x = logFC, y = -log10(PValue))) + 
  geom_point(aes(color = FDR < 0.05)) + 
  scale_color_manual(values = c("black", "red")) + 
  theme_minimal() + 
  labs(title = "Volcano Plot", x = "Log2 Fold Change", y = "-Log10 P-value")

ggsave(file.path("edgeR_transcript", "transcript_volcano_plot.png"), 
       width = 6, height = 6, dpi = 300)

Interactive volcano plot

library(ggiraph)
library(plotly)

gg <- ggplot(deg_results, aes(x = logFC, y = -log10(PValue),
                              tooltip = Gene, data_id = Gene)) +
  geom_point_interactive(aes(color = FDR < 0.05)) +
  scale_color_manual(values = c("black", "red")) +
  theme_minimal() +
  labs(title = "Interactive Volcano Plot", x = "Log2 Fold Change", y = "-Log10 P-value")

girafe(ggobj = gg, width_svg = 10, height_svg = 6)

Save count matrices

# Transcript raw counts
write.table(dge_raw$counts, 
            file.path("edgeR_transcript", "raw_counts_transcripts.txt"), 
            sep = "\t", quote = FALSE)

# Transcript normalized counts
write.table(cpm(dge), 
            file.path("edgeR_transcript", "normalized_counts_transcripts.txt"), 
            sep = "\t", quote = FALSE)

Genes

counts_gene <- read.delim("raw_gene_counts_matrix.txt", 
                          header = TRUE, sep = "\t", stringsAsFactors = FALSE)

# Use 'gene_id' as rownames
rownames(counts_gene) <- counts_gene$gene_id
counts_gene$gene_id <- NULL

counts <- as.data.frame(counts_gene)

group <- factor(c("Q157R", "Q157R", "Q157R", "WT", "WT", "WT"))
dge_raw <- DGEList(counts = counts, group = group)

Visualize raw counts - Genes

boxplot(dge_raw$counts, main = "Boxplot of Raw Gene Counts", las = 2, col = as.numeric(group))

plotMDS(dge_raw, col = as.numeric(group), main = "MDS Plot for Raw Gene Counts")

plotMD(dge_raw, col = as.numeric(group), main = "MD Plot for Raw Gene Counts")
abline(h = 0, col = "red", lty = 2, lwd = 2)

MD plots per sample - Genes

par(mfrow = c(2, 3), mar = c(4, 4, 2, 1))
for (i in 1:ncol(dge_raw$counts)) {
  plotMD(dge_raw, column = i, main = paste("MD Plot for Sample", i))
  abline(h = 0, col = "red", lty = 2, lwd = 2)
}

par(mfrow = c(1, 1))

Filter lowly expressed genes

keep <- filterByExpr(dge_raw)
table(keep)
## keep
## FALSE  TRUE 
##  2565  9925
dge <- dge_raw[keep, , keep.lib.sizes = FALSE]

Normalize Counts with TMM - Genes

dge <- calcNormFactors(dge, method = "TMM")

Save counts matrices - Genes

# Save dge_raw and dge_norm objects for downstream correlation analyses
save(dge_raw, file = file.path("edgeR_gene", "dge_raw.rda"))
save(dge,      file = file.path("edgeR_gene", "dge_norm.rda"))

Visualize normalized counts - Genes

boxplot(dge$counts, main = "Boxplot of Normalized Gene Counts")

Log-transform and boxplot: Gene-level CPM

log_cpm_gene <- cpm(dge, log = TRUE, prior.count = 1)

boxplot(log_cpm_gene,
        main = "Boxplot of log2(CPM + 1) - Genes",
        las = 2,
        col = as.numeric(group))

, las = 2, col = as.numeric(group)) plotMDS(dge, col = as.numeric(group), main = “MDS Plot for Normalized Gene Counts”) plotMD(dge, col = as.numeric(group), main = “MD Plot for Normalized Gene Counts”) abline(h = 0, col = “red”, lty = 2, lwd = 2)


Design matrix and dispersion - Genes

``` r
design <- model.matrix(~ group)
rownames(design) <- colnames(dge)

dge_disp <- estimateDisp(dge, design)
dge_disp$common.dispersion
## [1] 0.05120293
plotBCV(dge_disp)

Fit model and test - Genes

fit <- glmQLFit(dge_disp, design)
plotQLDisp(fit)

result <- glmQLFTest(fit, coef = 2)
topTags(result)
## Coefficient:  groupWT 
##             logFC   logCPM        F       PValue        FDR
## PB.11107 1.824440 4.494571 98.55092 6.454804e-06 0.02552083
## PB.16163 2.209439 5.580095 90.23341 9.081392e-06 0.02552083
## PB.3791  2.264203 3.952958 89.77741 9.282103e-06 0.02552083
## PB.11890 1.914397 5.101631 83.92693 1.200963e-05 0.02552083
## PB.21599 1.642828 5.118482 72.95964 2.050614e-05 0.02552083
## PB.2313  2.023493 7.626385 71.52524 2.210383e-05 0.02552083
## PB.18006 2.114723 4.507783 70.59935 2.324456e-05 0.02552083
## PB.10619 1.600779 4.703694 70.56518 2.327693e-05 0.02552083
## PB.7848  2.263158 3.783320 67.35188 2.782043e-05 0.02552083
## PB.14007 2.105173 5.321121 67.24421 2.791982e-05 0.02552083

FDR and DEG Summary - Genes

FDR <- p.adjust(result$table$PValue, method = "BH")
sum(FDR < 0.05)
## [1] 65
qlf <- glmQLFTest(fit)
top <- rownames(topTags(qlf))
cpm(dge_disp)[top, ]
##          BioSample_1 BioSample_2 BioSample_3 BioSample_4 BioSample_5
## PB.11107    9.654665    9.259909   10.329900    30.88463    37.82332
## PB.16163   17.998202   11.628722   20.376789    65.33287    90.85137
## PB.3791     4.410155    4.306934    6.792263    23.07862    24.88046
## PB.11890   15.495141   12.705456   14.292053    43.61181    51.64580
## PB.21599   16.925462   15.289617   17.688185    43.61181    62.32681
## PB.2313    91.182944   62.235200   79.526078   219.75602   333.62433
## PB.18006    9.058698    7.967828    8.065812    32.07250    48.12735
## PB.10619   13.826433   11.628722   12.593987    34.44824    36.56673
## PB.7848     4.529349    4.522281    4.811186    17.13927    29.27852
## PB.14007   18.355782   10.982682   14.858075    48.53299    63.83471
##          BioSample_6
## PB.11107    35.12589
## PB.16163    78.35775
## PB.3791     27.32014
## PB.11890    66.04868
## PB.21599    50.28706
## PB.2313    396.74241
## PB.18006    28.82124
## PB.10619    45.18330
## PB.7848     19.96471
## PB.14007    80.90963
summary(decideTests(qlf))
##        groupWT
## Down         2
## NotSig    9860
## Up          63
plotMD(qlf)
abline(h = c(-1, 1), col = "blue")

Save DEG results & intermediate files - Genes

deg_results <- topTags(result, n = Inf)$table
write.table(deg_results, 
            file.path("edgeR_gene", "gene_DEG_results.txt"), 
            sep = "\t", quote = FALSE, row.names = TRUE)

write.table(cpm(dge_raw), 
            file.path("edgeR_gene", "raw_CPM_matrix_gene.txt"), 
            sep = "\t", quote = FALSE)
write.table(cpm(dge), 
            file.path("edgeR_gene", "normalized_CPM_matrix_gene.txt"), 
            sep = "\t", quote = FALSE)
write.table(rownames(dge), 
            file.path("edgeR_gene", "filtered_genes.txt"), 
            quote = FALSE, row.names = FALSE, col.names = FALSE)
write.table(design, 
            file.path("edgeR_gene", "design_matrix_gene.txt"), 
            sep = "\t", quote = FALSE, row.names = TRUE, col.names = NA)

top_genes <- topTags(qlf, n = 50)$table
write.table(top_genes, 
            file.path("edgeR_gene", "top_genes.txt"), 
            sep = "\t", quote = FALSE, row.names = TRUE)

png(file.path("edgeR_gene", "BCV_plot_gene.png"), width = 800, height = 600)
plotBCV(dge_disp)
dev.off()
## quartz_off_screen 
##                 2
png(file.path("edgeR_gene", "QLDisp_plot_gene.png"), width = 800, height = 600)
plotQLDisp(fit)
dev.off()
## quartz_off_screen 
##                 2
png(file.path("edgeR_gene", "MDS_plot_gene.png"), width = 800, height = 600)
plotMDS(dge, col = as.numeric(group), main = "MDS Plot (Normalized Genes)")
dev.off()
## quartz_off_screen 
##                 2

Volcano Plot - Genes

deg_results$Gene <- rownames(deg_results)

ggplot(deg_results, aes(x = logFC, y = -log10(PValue))) + 
  geom_point(aes(color = FDR < 0.05)) + 
  scale_color_manual(values = c("black", "red")) + 
  theme_minimal() + 
  labs(title = "Gene Volcano Plot", x = "Log2 Fold Change", y = "-Log10 P-value")

ggsave(file.path("edgeR_gene", "gene_volcano_plot.png"), 
       width = 6, height = 6, dpi = 300)

Interactive volcano plot - Genes

gg <- ggplot(deg_results, aes(x = logFC, y = -log10(PValue),
                              tooltip = Gene, data_id = Gene)) +
  geom_point_interactive(aes(color = FDR < 0.05)) +
  scale_color_manual(values = c("black", "red")) +
  theme_minimal() +
  labs(title = "Interactive Gene Volcano Plot", x = "Log2 Fold Change", y = "-Log10 P-value")

girafe(ggobj = gg, width_svg = 10, height_svg = 6)

Save count matrices

# Gene raw counts
write.table(dge_raw$counts, 
            file.path("edgeR_gene", "raw_counts_genes.txt"), 
            sep = "\t", quote = FALSE)

# Gene normalized counts
write.table(cpm(dge), 
            file.path("edgeR_gene", "normalized_counts_genes.txt"), 
            sep = "\t", quote = FALSE)